{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Packages\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Python Data Analysis Library: http://pandas.pydata.org   \n",
    "statistical data visualization: http://seaborn.pydata.org   \n",
    "Python 2D plotting library: https://matplotlib.org"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 0. 数据载入 (Load Data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load dataset\n",
    "# source: https://archive.ics.uci.edu/ml/datasets/wine+quality\n",
    "df = pd.read_csv('./input/winequality-red.csv', sep=';', quotechar='\"')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. 探索性分析 (Exploratory Analysis)   \n",
    "\n",
    "掌握数据的基本情况：\n",
    "- 数据量\n",
    "- 特征数\n",
    "- 目标变量\n",
    "- 数据类型(整数、浮点数等)\n",
    "- 取值范围（最大值、最小值、平均值、中位值等）\n",
    "- 数值分布\n",
    "- 相关度\n",
    "- ......"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>fixed acidity</th>\n",
       "      <th>volatile acidity</th>\n",
       "      <th>citric acid</th>\n",
       "      <th>residual sugar</th>\n",
       "      <th>chlorides</th>\n",
       "      <th>free sulfur dioxide</th>\n",
       "      <th>total sulfur dioxide</th>\n",
       "      <th>density</th>\n",
       "      <th>pH</th>\n",
       "      <th>sulphates</th>\n",
       "      <th>alcohol</th>\n",
       "      <th>quality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7.4</td>\n",
       "      <td>0.70</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.9</td>\n",
       "      <td>0.076</td>\n",
       "      <td>11.0</td>\n",
       "      <td>34.0</td>\n",
       "      <td>0.9978</td>\n",
       "      <td>3.51</td>\n",
       "      <td>0.56</td>\n",
       "      <td>9.4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7.8</td>\n",
       "      <td>0.88</td>\n",
       "      <td>0.00</td>\n",
       "      <td>2.6</td>\n",
       "      <td>0.098</td>\n",
       "      <td>25.0</td>\n",
       "      <td>67.0</td>\n",
       "      <td>0.9968</td>\n",
       "      <td>3.20</td>\n",
       "      <td>0.68</td>\n",
       "      <td>9.8</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7.8</td>\n",
       "      <td>0.76</td>\n",
       "      <td>0.04</td>\n",
       "      <td>2.3</td>\n",
       "      <td>0.092</td>\n",
       "      <td>15.0</td>\n",
       "      <td>54.0</td>\n",
       "      <td>0.9970</td>\n",
       "      <td>3.26</td>\n",
       "      <td>0.65</td>\n",
       "      <td>9.8</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11.2</td>\n",
       "      <td>0.28</td>\n",
       "      <td>0.56</td>\n",
       "      <td>1.9</td>\n",
       "      <td>0.075</td>\n",
       "      <td>17.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>0.9980</td>\n",
       "      <td>3.16</td>\n",
       "      <td>0.58</td>\n",
       "      <td>9.8</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7.4</td>\n",
       "      <td>0.70</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.9</td>\n",
       "      <td>0.076</td>\n",
       "      <td>11.0</td>\n",
       "      <td>34.0</td>\n",
       "      <td>0.9978</td>\n",
       "      <td>3.51</td>\n",
       "      <td>0.56</td>\n",
       "      <td>9.4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \\\n",
       "0            7.4              0.70         0.00             1.9      0.076   \n",
       "1            7.8              0.88         0.00             2.6      0.098   \n",
       "2            7.8              0.76         0.04             2.3      0.092   \n",
       "3           11.2              0.28         0.56             1.9      0.075   \n",
       "4            7.4              0.70         0.00             1.9      0.076   \n",
       "\n",
       "   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \\\n",
       "0                 11.0                  34.0   0.9978  3.51       0.56   \n",
       "1                 25.0                  67.0   0.9968  3.20       0.68   \n",
       "2                 15.0                  54.0   0.9970  3.26       0.65   \n",
       "3                 17.0                  60.0   0.9980  3.16       0.58   \n",
       "4                 11.0                  34.0   0.9978  3.51       0.56   \n",
       "\n",
       "   alcohol  quality  \n",
       "0      9.4        5  \n",
       "1      9.8        5  \n",
       "2      9.8        5  \n",
       "3      9.8        6  \n",
       "4      9.4        5  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# head samples\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1599 entries, 0 to 1598\n",
      "Data columns (total 12 columns):\n",
      "fixed acidity           1599 non-null float64\n",
      "volatile acidity        1599 non-null float64\n",
      "citric acid             1599 non-null float64\n",
      "residual sugar          1599 non-null float64\n",
      "chlorides               1599 non-null float64\n",
      "free sulfur dioxide     1599 non-null float64\n",
      "total sulfur dioxide    1599 non-null float64\n",
      "density                 1599 non-null float64\n",
      "pH                      1599 non-null float64\n",
      "sulphates               1599 non-null float64\n",
      "alcohol                 1599 non-null float64\n",
      "quality                 1599 non-null int64\n",
      "dtypes: float64(11), int64(1)\n",
      "memory usage: 150.0 KB\n"
     ]
    }
   ],
   "source": [
    "# Information about the data\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>fixed acidity</th>\n",
       "      <th>volatile acidity</th>\n",
       "      <th>citric acid</th>\n",
       "      <th>residual sugar</th>\n",
       "      <th>chlorides</th>\n",
       "      <th>free sulfur dioxide</th>\n",
       "      <th>total sulfur dioxide</th>\n",
       "      <th>density</th>\n",
       "      <th>pH</th>\n",
       "      <th>sulphates</th>\n",
       "      <th>alcohol</th>\n",
       "      <th>quality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1599.000000</td>\n",
       "      <td>1599.000000</td>\n",
       "      <td>1599.000000</td>\n",
       "      <td>1599.000000</td>\n",
       "      <td>1599.000000</td>\n",
       "      <td>1599.000000</td>\n",
       "      <td>1599.000000</td>\n",
       "      <td>1599.000000</td>\n",
       "      <td>1599.000000</td>\n",
       "      <td>1599.000000</td>\n",
       "      <td>1599.000000</td>\n",
       "      <td>1599.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>8.319637</td>\n",
       "      <td>0.527821</td>\n",
       "      <td>0.270976</td>\n",
       "      <td>2.538806</td>\n",
       "      <td>0.087467</td>\n",
       "      <td>15.874922</td>\n",
       "      <td>46.467792</td>\n",
       "      <td>0.996747</td>\n",
       "      <td>3.311113</td>\n",
       "      <td>0.658149</td>\n",
       "      <td>10.422983</td>\n",
       "      <td>5.636023</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>1.741096</td>\n",
       "      <td>0.179060</td>\n",
       "      <td>0.194801</td>\n",
       "      <td>1.409928</td>\n",
       "      <td>0.047065</td>\n",
       "      <td>10.460157</td>\n",
       "      <td>32.895324</td>\n",
       "      <td>0.001887</td>\n",
       "      <td>0.154386</td>\n",
       "      <td>0.169507</td>\n",
       "      <td>1.065668</td>\n",
       "      <td>0.807569</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>4.600000</td>\n",
       "      <td>0.120000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.900000</td>\n",
       "      <td>0.012000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>0.990070</td>\n",
       "      <td>2.740000</td>\n",
       "      <td>0.330000</td>\n",
       "      <td>8.400000</td>\n",
       "      <td>3.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>7.100000</td>\n",
       "      <td>0.390000</td>\n",
       "      <td>0.090000</td>\n",
       "      <td>1.900000</td>\n",
       "      <td>0.070000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>22.000000</td>\n",
       "      <td>0.995600</td>\n",
       "      <td>3.210000</td>\n",
       "      <td>0.550000</td>\n",
       "      <td>9.500000</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>7.900000</td>\n",
       "      <td>0.520000</td>\n",
       "      <td>0.260000</td>\n",
       "      <td>2.200000</td>\n",
       "      <td>0.079000</td>\n",
       "      <td>14.000000</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>0.996750</td>\n",
       "      <td>3.310000</td>\n",
       "      <td>0.620000</td>\n",
       "      <td>10.200000</td>\n",
       "      <td>6.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>9.200000</td>\n",
       "      <td>0.640000</td>\n",
       "      <td>0.420000</td>\n",
       "      <td>2.600000</td>\n",
       "      <td>0.090000</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>62.000000</td>\n",
       "      <td>0.997835</td>\n",
       "      <td>3.400000</td>\n",
       "      <td>0.730000</td>\n",
       "      <td>11.100000</td>\n",
       "      <td>6.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>15.900000</td>\n",
       "      <td>1.580000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>15.500000</td>\n",
       "      <td>0.611000</td>\n",
       "      <td>72.000000</td>\n",
       "      <td>289.000000</td>\n",
       "      <td>1.003690</td>\n",
       "      <td>4.010000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>14.900000</td>\n",
       "      <td>8.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       fixed acidity  volatile acidity  citric acid  residual sugar  \\\n",
       "count    1599.000000       1599.000000  1599.000000     1599.000000   \n",
       "mean        8.319637          0.527821     0.270976        2.538806   \n",
       "std         1.741096          0.179060     0.194801        1.409928   \n",
       "min         4.600000          0.120000     0.000000        0.900000   \n",
       "25%         7.100000          0.390000     0.090000        1.900000   \n",
       "50%         7.900000          0.520000     0.260000        2.200000   \n",
       "75%         9.200000          0.640000     0.420000        2.600000   \n",
       "max        15.900000          1.580000     1.000000       15.500000   \n",
       "\n",
       "         chlorides  free sulfur dioxide  total sulfur dioxide      density  \\\n",
       "count  1599.000000          1599.000000           1599.000000  1599.000000   \n",
       "mean      0.087467            15.874922             46.467792     0.996747   \n",
       "std       0.047065            10.460157             32.895324     0.001887   \n",
       "min       0.012000             1.000000              6.000000     0.990070   \n",
       "25%       0.070000             7.000000             22.000000     0.995600   \n",
       "50%       0.079000            14.000000             38.000000     0.996750   \n",
       "75%       0.090000            21.000000             62.000000     0.997835   \n",
       "max       0.611000            72.000000            289.000000     1.003690   \n",
       "\n",
       "                pH    sulphates      alcohol      quality  \n",
       "count  1599.000000  1599.000000  1599.000000  1599.000000  \n",
       "mean      3.311113     0.658149    10.422983     5.636023  \n",
       "std       0.154386     0.169507     1.065668     0.807569  \n",
       "min       2.740000     0.330000     8.400000     3.000000  \n",
       "25%       3.210000     0.550000     9.500000     5.000000  \n",
       "50%       3.310000     0.620000    10.200000     6.000000  \n",
       "75%       3.400000     0.730000    11.100000     6.000000  \n",
       "max       4.010000     2.000000    14.900000     8.000000  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Describe the dataset\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([5, 6, 7, 4, 8, 3])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['quality'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5    681\n",
       "6    638\n",
       "7    199\n",
       "4     53\n",
       "8     18\n",
       "3     10\n",
       "Name: quality, dtype: int64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['quality'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x10a037810>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEKCAYAAAAIO8L1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFCZJREFUeJzt3X+w5XV93/HnSxaCUHFBLhR3oavJlsRJIuAdSsL4o25MhShLU3FwqmwonbUdwhibNiXNTJq0yYyZmqiQlA6B4K4xKsEQVoex0lXU6IAuPwIIOqwE2ZvF3VUBRWos+u4f53OHy+6H3bNwv/fcvft8zJz5fr+f7+d7zvsMw33t5/vjc1JVSJK0u+dNugBJ0uJkQEiSugwISVKXASFJ6jIgJEldBoQkqcuAkCR1GRCSpC4DQpLUtWzSBTwXxx57bK1atWrSZUjSAeW22277ZlVN7avfYAGR5GTgI3OaXgr8NrCxta8CHgTeXFWPJAnwPuBs4AngV6rq9r19xqpVq9iyZcv8Fy9JS1iSr4/Tb7BTTFX11ao6papOAV7B6I/+9cClwOaqWg1sbtsAZwGr22s9cMVQtUmS9m2hrkGsAb5WVV8H1gIbWvsG4Ny2vhbYWCO3AMuTnLBA9UmSdrNQAXE+8KG2fnxVPQzQlse19hXAtjnHzLQ2SdIEDB4QSQ4DzgH+cl9dO217zEWeZH2SLUm27Nq1az5KlCR1LMQI4izg9qra0bZ3zJ46asudrX0GOHHOcSuB7bu/WVVdWVXTVTU9NbXPi/CSpGdpIQLiLTx1eglgE7Cura8DbpjTfkFGzgAemz0VJUlaeIM+B5HkCOB1wNvnNL8LuDbJRcBDwHmt/UZGt7huZXTH04VD1iZJ2rtBA6KqngBetFvbtxjd1bR73wIuHrIeSdL4nGpDktR1QE+1oaXnzMvPnHQJ++3zl3x+0iVIg3AEIUnqMiAkSV0GhCSpy4CQJHUZEJKkLgNCktRlQEiSugwISVKXASFJ6jIgJEldBoQkqcuAkCR1OVmftIA+86pXT7qE/fbqz35m0iVoQhxBSJK6DAhJUpcBIUnqMiAkSV0GhCSpy4CQJHUZEJKkrkEDIsnyJNcl+UqS+5L8XJJjktyU5P62PLr1TZLLkmxNcleS04asTZK0d0OPIN4HfKKqfhJ4OXAfcCmwuapWA5vbNsBZwOr2Wg9cMXBtkqS9GCwgkhwFvAq4GqCqflBVjwJrgQ2t2wbg3La+FthYI7cAy5OcMFR9kqS9G3IE8VJgF3BNkjuSXJXkSOD4qnoYoC2Pa/1XANvmHD/T2p4myfokW5Js2bVr14DlS9LBbciAWAacBlxRVacC3+Op00k96bTVHg1VV1bVdFVNT01NzU+lkqQ9DBkQM8BMVd3atq9jFBg7Zk8dteXOOf1PnHP8SmD7gPVJkvZisICoqm8A25Kc3JrWAPcCm4B1rW0dcENb3wRc0O5mOgN4bPZUlCRp4Q093fclwAeTHAY8AFzIKJSuTXIR8BBwXut7I3A2sBV4ovWVJE3IoAFRVXcC051dazp9C7h4yHokSePzSWpJUpcBIUnqMiAkSV0GhCSpy4CQJHUZEJKkLgNCktRlQEiSugwISVKXASFJ6jIgJEldBoQkqcuAkCR1GRCSpC4DQpLUZUBIkroMCElSlwEhSeoyICRJXQaEJKnLgJAkdQ0aEEkeTHJ3kjuTbGltxyS5Kcn9bXl0a0+Sy5JsTXJXktOGrE2StHcLMYL451V1SlVNt+1Lgc1VtRrY3LYBzgJWt9d64IoFqE2S9AwmcYppLbChrW8Azp3TvrFGbgGWJzlhAvVJkhg+IAr4ZJLbkqxvbcdX1cMAbXlca18BbJtz7ExrkyRNwLKB3//Mqtqe5DjgpiRf2UvfdNpqj06joFkPcNJJJ81PlZKkPQw6gqiq7W25E7geOB3YMXvqqC13tu4zwIlzDl8JbO+855VVNV1V01NTU0OWL0kHtcECIsmRSV4wuw78InAPsAlY17qtA25o65uAC9rdTGcAj82eipIkLbwhTzEdD1yfZPZz/qKqPpHkS8C1SS4CHgLOa/1vBM4GtgJPABcOWJskaR8GC4iqegB4eaf9W8CaTnsBFw9VjyRp//gktSSpy4CQJHUZEJKkLgNCktRlQEiSugwISVKXASFJ6jIgJEldBoQkqcuAkCR1GRCSpC4DQpLUZUBIkroMCElSlwEhSeoyICRJXQaEJKnLgJAkdRkQkqQuA0KS1GVASJK6DAhJUtfgAZHkkCR3JPl4235JkluT3J/kI0kOa+0/1ra3tv2rhq5NkvTMFmIE8Q7gvjnbfwC8p6pWA48AF7X2i4BHquongPe0fpKkCRk0IJKsBH4JuKptB3gtcF3rsgE4t62vbdu0/Wtaf0nSBAw9gngv8BvAj9r2i4BHq+rJtj0DrGjrK4BtAG3/Y63/0yRZn2RLki27du0asnZJOqiNFRBJNo/Tttv+NwA7q+q2uc2drjXGvqcaqq6squmqmp6amtpbCZKk52DZ3nYmORw4Ajg2ydE89Uf8KODF+3jvM4FzkpwNHN6OeS+wPMmyNkpYCWxv/WeAE4GZJMuAFwLf3v+vJEmaD/saQbwduA34ybacfd0A/MneDqyq36yqlVW1Cjgf+FRV/Wvg08CbWrd17b0ANrVt2v5PVdUeIwhJ0sLY6wiiqt4HvC/JJVV1+Tx95n8GPpzk94A7gKtb+9XAB5JsZTRyOH+ePk+S9CzsNSBmVdXlSX4eWDX3mKraOObxNwM3t/UHgNM7fb4PnDfO+0mShjdWQCT5APDjwJ3AD1tzAWMFhCTpwDNWQADTwMu8JiBJB49xn4O4B/jHQxYiSVpcxh1BHAvcm+SLwD/MNlbVOYNUJUmauHED4neGLEKStPiMexfTZ4YuRJK0uIx7F9N3eWrai8OAQ4HvVdVRQxUmSZqscUcQL5i7neRcOs8ySJKWjmc1m2tV/TWjabslSUvUuKeYfnnO5vMYPRfhMxGStISNexfTG+esPwk8yOgHfiRJS9S41yAuHLoQSdLiMu4PBq1Mcn2SnUl2JPlo+zlRSdISNe5F6msY/V7Dixn9NOjHWpskaYkaNyCmquqaqnqyvd4P+HufkrSEjRsQ30zy1iSHtNdbgW8NWZgkabLGDYh/A7wZ+AbwMKOfBPXCtSQtYePe5vrfgXVV9QhAkmOAdzMKDknSEjTuCOJnZ8MBoKq+DZw6TEmSpMVg3IB4XpKjZzfaCGLc0Yck6QA07h/5PwS+kOQ6RlNsvBn4/cGqkiRN3FgjiKraCPwrYAewC/jlqvrA3o5JcniSLyb52yRfTvK7rf0lSW5Ncn+SjyQ5rLX/WNve2vavei5fTJL03Iw9m2tV3VtVf1xVl1fVvWMc8g/Aa6vq5cApwOuTnAH8AfCeqloNPAJc1PpfBDxSVT8BvKf1kyRNyLOa7nscNfJ42zy0vYrRNOHXtfYNwLltfW3bpu1fkyRD1SdJ2rvBAgKgPVR3J7ATuAn4GvBoVT3ZuswwmrqDttwG0PY/BrxoyPokSc9s0ICoqh9W1SnASka/QPdTvW5t2Rst7PGbE0nWJ9mSZMuuXbvmr1hJ0tMMGhCzqupR4GbgDGB5ktm7p1YC29v6DHAiQNv/QuDbnfe6sqqmq2p6asrpoCRpKIMFRJKpJMvb+vOBXwDuAz7NaKoOgHXADW19U9um7f9UVfmrdZI0IUM+7HYCsCHJIYyC6Nqq+niSe4EPJ/k94A7g6tb/auADSbYyGjmcP2BtkqR9GCwgquouOtNxVNUDjK5H7N7+feC8oeqRJO2fBbkGIUk68BgQkqQuA0KS1GVASJK6DAhJUpcBIUnqMiAkSV0GhCSpy4CQJHUZEJKkLgNCktRlQEiSugwISVKXASFJ6jIgJEldBoQkqcuAkCR1GRCSpC4DQpLUZUBIkroMCElS12ABkeTEJJ9Ocl+SLyd5R2s/JslNSe5vy6Nbe5JclmRrkruSnDZUbZKkfRtyBPEk8OtV9VPAGcDFSV4GXApsrqrVwOa2DXAWsLq91gNXDFibJGkfBguIqnq4qm5v698F7gNWAGuBDa3bBuDctr4W2FgjtwDLk5wwVH2SpL1bkGsQSVYBpwK3AsdX1cMwChHguNZtBbBtzmEzrU2SNAHLhv6AJP8I+Cjwa1X1nSTP2LXTVp33W8/oFBQnnXTSfJUpaR788a9/bNIl7Jdf/cM3TrqERW3QEUSSQxmFwwer6q9a847ZU0dtubO1zwAnzjl8JbB99/esqiurarqqpqempoYrXpIOckPexRTgauC+qvqjObs2Aeva+jrghjntF7S7mc4AHps9FSVJWnhDnmI6E3gbcHeSO1vbfwHeBVyb5CLgIeC8tu9G4GxgK/AEcOGAtUmS9mGwgKiqv6F/XQFgTad/ARcPVY8kaf/4JLUkqcuAkCR1GRCSpC4DQpLUZUBIkroMCElSlwEhSeoyICRJXQaEJKnLgJAkdRkQkqQuA0KS1GVASJK6DAhJUpcBIUnqMiAkSV0GhCSpy4CQJHUZEJKkLgNCktRlQEiSugwISVLXYAGR5M+S7Exyz5y2Y5LclOT+tjy6tSfJZUm2JrkryWlD1SVJGs+QI4j3A6/fre1SYHNVrQY2t22As4DV7bUeuGLAuiRJYxgsIKrqs8C3d2teC2xo6xuAc+e0b6yRW4DlSU4YqjZJ0r4t9DWI46vqYYC2PK61rwC2zek309r2kGR9ki1JtuzatWvQYiXpYLZYLlKn01a9jlV1ZVVNV9X01NTUwGVJ0sFroQNix+ypo7bc2dpngBPn9FsJbF/g2iRJcyx0QGwC1rX1dcANc9ovaHcznQE8NnsqSpI0GcuGeuMkHwJeAxybZAb4r8C7gGuTXAQ8BJzXut8InA1sBZ4ALhyqLknSeAYLiKp6yzPsWtPpW8DFQ9UiSdp/i+UitSRpkTEgJEldBoQkqcuAkCR1GRCSpC4DQpLUZUBIkroMCElSlwEhSeoa7ElqDeOh//Yzky5hv53023dPugRJz4IjCElSlwEhSeoyICRJXQaEJKnLgJAkdRkQkqQuA0KS1GVASJK6DAhJUpdPUkvSmH7/rW+adAn75bf+/LrndLwjCElS16IKiCSvT/LVJFuTXDrpeiTpYLZoTjElOQT4E+B1wAzwpSSbqure/X2vV/ynjfNd3qBu+x8XTLoESdrDYhpBnA5sraoHquoHwIeBtROuSZIOWospIFYA2+Zsz7Q2SdIEpKomXQMASc4D/kVV/du2/Tbg9Kq6ZLd+64H1bfNk4KsLWOaxwDcX8PMWmt/vwLWUvxv4/ebbP6mqqX11WjTXIBiNGE6cs70S2L57p6q6ErhyoYqaK8mWqpqexGcvBL/fgWspfzfw+03KYjrF9CVgdZKXJDkMOB/YNOGaJOmgtWhGEFX1ZJJfBf43cAjwZ1X15QmXJUkHrUUTEABVdSNw46Tr2IuJnNpaQH6/A9dS/m7g95uIRXORWpK0uCymaxCSpEXEgBhDksOTfDHJ3yb5cpLfnXRN8y3JIUnuSPLxSdcy35I8mOTuJHcm2TLpeuZbkuVJrkvylST3Jfm5Sdc0X5Kc3P67zb6+k+TXJl3XfEnyzvY35Z4kH0py+KRrmstTTGNIEuDIqno8yaHA3wDvqKpbJlzavEnyH4Bp4KiqesOk65lPSR4EpqtqSd5Hn2QD8LmquqrdAXhEVT066brmW5uO5++Bf1ZVX590Pc9VkhWM/pa8rKr+b5JrgRur6v2TrewpjiDGUCOPt81D22vJJGuSlcAvAVdNuhbtnyRHAa8Crgaoqh8sxXBo1gBfWwrhMMcy4PlJlgFH0Hn2a5IMiDG1UzB3AjuBm6rq1knXNI/eC/wG8KNJFzKQAj6Z5Lb2JP5S8lJgF3BNO0V4VZIjJ13UQM4HPjTpIuZLVf098G7gIeBh4LGq+uRkq3o6A2JMVfXDqjqF0RPepyf56UnXNB+SvAHYWVW3TbqWAZ1ZVacBZwEXJ3nVpAuaR8uA04ArqupU4HvAkpsqv506Owf4y0nXMl+SHM1oQtKXAC8Gjkzy1slW9XQGxH5qw/ebgddPuJT5ciZwTjtP/2HgtUn+fLIlza+q2t6WO4HrGc0cvFTMADNzRrTXMQqMpeYs4Paq2jHpQubRLwB/V1W7qur/AX8F/PyEa3oaA2IMSaaSLG/rz2f0H/Yrk61qflTVb1bVyqpaxWgI/6mqWlT/inkukhyZ5AWz68AvAvdMtqr5U1XfALYlObk1rQH2+zdUDgBvYQmdXmoeAs5IckS7EWYNcN+Ea3qaRfUk9SJ2ArCh3UXxPODaqlpyt4MuUccD14/+/2MZ8BdV9YnJljTvLgE+2E7DPABcOOF65lWSIxj9kNjbJ13LfKqqW5NcB9wOPAncwSJ7otrbXCVJXZ5ikiR1GRCSpC4DQpLUZUBIkroMCElSlwEhDSjJqiT3tPXpJJe19dckWVQPRUm78zkIaYFU1RZgdrrx1wCPA1+YWEHSPjiCkJ5Bkt9K8tUk/6fN1f8fk9ycZLrtP7ZNUTI7Uvhcktvba4/RQRs1fDzJKuDfAe9sv3HwyiR/16aSJ8lR7TcsDl2wLyt1OIKQOpK8gtHUI6cy+v/kdmBvExruBF5XVd9PsprRtBDTvY5V9WCS/wU8XlXvbp93M6Mp1/+6fe5H2/w80sQ4gpD6XglcX1VPVNV3gE376H8o8KdJ7mY04+jL9vPzruKpKTIuBK7Zz+OleecIQnpmvXlonuSpf1jN/XnIdwI7gJe3/d/frw+q+nw7TfVq4JCqWjITCurA5QhC6vss8C+TPL/NBvvG1v4g8Iq2/qY5/V8IPFxVPwLeBhyyj/f/LvCC3do2Mjo15ehBi4IBIXVU1e3AR4A7gY8Cn2u73g38+yRfAI6dc8j/BNYluQX4p4x+uGdvPsYogO5M8srW9kHgaJbetNY6QDmbqzSGJL/DnIvKA33Gm4C1VfW2oT5D2h9eg5AWgSSXM/rVtLMnXYs0yxGEJKnLaxCSpC4DQpLUZUBIkroMCElSlwEhSeoyICRJXf8fTBZcxWvQ7foAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# count of the target variable\n",
    "sns.countplot(x='quality', data=df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x10a02b510>"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmQAAAF3CAYAAAALu1cUAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAGQVJREFUeJzt3XuwZWdZJvDnTTchkEkA7dbWXOgoASc6KNpGNAqMoCaISQ2CkziAMgxRi4DipSsWUyg4M1XTYnkFnQwKKAqGKBqsaFCRy4BgOhCBJKaqDYF0h2MCyB0MSd754+zo4aQvuyHrfOec/ftVndp7rfWdvZ/e1dV58n1rr1XdHQAAxjlmdAAAgEWnkAEADKaQAQAMppABAAymkAEADKaQAQAMppABAAymkAEADKaQAQAMppABAAy2dXSAo7Vt27beuXPn6BgAAEd09dVXf6i7tx9p3IYrZDt37szevXtHxwAAOKKqev884yxZAgAMppABAAymkAEADKaQAQAMppABAAymkAEADKaQAQAMppABAAymkAEADKaQAQAMppABAAymkAEADLbhbi6+Ue3evTtLS0vZsWNH9uzZMzoOALCOKGRrZGlpKQcOHBgdAwBYhyxZAgAMppABAAymkAEADKaQAQAMppABAAymkAEADKaQAQAMppABAAymkAEADKaQAQAMppABAAymkAEADKaQAQAMppABAAymkAEADKaQAQAMppABAAymkAEADKaQAQAMppABAAymkAEADKaQAQAMppABAAw2WSGrqt+pqlur6r2HOF5V9WtVta+q3l1V3zhVFgCA9WzKGbKXJzn7MMfPSXL67OfCJL85YRYAgHVrskLW3W9O8pHDDDkvye/2srcneWBVfcVUeQAA1quR55CdlOTmFdv7Z/sAABbKyEJWB9nXBx1YdWFV7a2qvbfddtvEsQAA1tbIQrY/ySkrtk9OcsvBBnb3Jd29q7t3bd++fU3CAQCslZGF7PIkT5t92/KRST7W3R8cmAcAYIitU71wVb0qyWOSbKuq/Ul+Lsl9kqS7fyvJFUken2Rfkk8nefpUWQAA1rPJCll3X3CE453kWVO9PwDARuFK/QAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAg20dHWAK3/Qzvzs6wj2c8KFPZEuSD3zoE+sq39W/+LTREQBg4ZkhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYbNJCVlVnV9UNVbWvqi4+yPFTq+pvqupdVfXuqnr8lHkAANajyQpZVW1J8uIk5yQ5I8kFVXXGqmH/Pcml3f2IJOcneclUeQAA1qspZ8jOTLKvu2/s7tuTvDrJeavGdJITZ88fkOSWCfMAAKxLWyd87ZOS3Lxie3+Sb1k15ueTvL6qnp3k+CSPmzAPAMC6NOUMWR1kX6/aviDJy7v75CSPT/J7VXWPTFV1YVXtraq9t9122wRRAQDGmbKQ7U9yyortk3PPJclnJLk0Sbr7b5Mcl2Tb6hfq7ku6e1d379q+fftEcQEAxpiykF2V5PSqOq2qjs3ySfuXrxrzgSSPTZKq+vdZLmSmwACAhTJZIevuO5JclOTKJNdn+duU11bVC6vq3Nmwn0ryzKr6+ySvSvLD3b16WRMAYFOb8qT+dPcVSa5Yte/5K55fl+SsKTMAAKx3rtQPADCYQgYAMJhCBgAwmEIGADCYQgYAMJhCBgAwmEIGADDYpNchg5F2796dpaWl7NixI3v27BkdBwAOSSFj01paWsqBAwdGxwCAI7JkCQAwmEIGADCYQgYAMJhCBgAwmEIGADCYQgYAMJhCBgAwmEIGADCYQgYAMJhCBgAwmEIGADCYQgYAMJhCBgAw2BELWVXtrapnVdWD1iLQZnXXscfnzvuemLuOPX50FABgndk6x5jzkzw9yVVVtTfJy5K8vrt70mSbzKdO/+7REQCAdeqIM2Tdva+7n5fkoUn+IMnvJPlAVb2gqr5k6oAAAJvdPDNkqaqHZ3mW7PFJ/ijJ7yf59iRvSPINk6Vjw/jAC//D6Aj3cMdHviTJ1tzxkfevq3ynPv89oyMAsM4csZBV1dVJPprkt5Nc3N3/Mjv0jqo6a8pwAACLYJ4Zsid3940rd1TVad39vu5+4kS5AAAWxjyXvbhszn0AAHwBDjlDVlVfk+RrkzygqlbOhJ2Y5LipgwEALIrDLVk+LMkTkjwwyfet2P+JJM+cMhQAwCI5ZCHr7j9N8qdV9a3d/bdrmAkAYKEcbslyd3fvSfKDVXXB6uPd/ZxJkwEALIjDLVleP3vcuxZBAAAW1eGWLF83e3zF2sUBAFg8h1uyfF2SQ96vsrvPnSQRAMCCOdyS5Ytmj09MsiPJK2fbFyS5acJMAAAL5XBLlm9Kkqr6he5+1IpDr6uqN0+eDABgQcxzpf7tVfVVd29U1WlJtk8XCQBgscxzL8vnJnljVd19P8udSX5kskQAAAvmiIWsu/+iqk5P8jWzXf/Q3f8ybSwAgMVxuG9Zfmd3v2HVfSyT5KurKt39xxNnAwBYCIebIXt0kjfk8+9jebdOopCxrm077q4kd8weAWD9Oty3LH9u9vj0tYsD956ffvhHR0cAgLkc8VuWVfW/quqBK7YfVFX/Y9pYAACLY57LXpzT3f861dDd/5zk8dNFAgBYLPMUsi1Vdd+7N6rqfknue5jxAAAchXkK2SuT/HVVPaOq/muSv0wy1w3Hq+rsqrqhqvZV1cWHGPMDVXVdVV1bVX8wf3QAgM1hnuuQ7amq9yR5bJJK8gvdfeWRfq+qtiR5cZLvSrI/yVVVdXl3X7dizOlJfjbJWd39z1X1ZV/gnwMAYMOa50r96e4/T/LnR/naZybZ1903JklVvTrJeUmuWzHmmUlePDsvLd1961G+BwDAhjfPtywfWVVXVdUnq+r2qrqzqj4+x2uflOTmFdv7Z/tWemiSh1bVW6vq7VV19vzRAQA2h3lmyH4jyflJXpNkV5KnJXnIHL9XB9nXB3n/05M8JsnJSd5SVV+38ludSVJVFya5MElOPfXUOd4aAGDjmOek/nT3viRbuvvO7n5Zkv84x6/tT3LKiu2Tk9xykDF/2t2f6+73JbkhywVt9ftf0t27unvX9u3b54kMALBhzDND9umqOjbJNVW1J8kHkxw/x+9dleT0qjotyYEsz7L94Koxf5LkgiQvr6ptWV7CvHHe8MD6snv37iwtLWXHjh3Zs2fP6DgAG8Y8M2RPnY27KMmnsjzr9f1H+qXuvmP2O1cmuT7Jpd19bVW9sKrOnQ27MsmHq+q6JH+T5Ge6+8NH/8cA1oOlpaUcOHAgS0tLo6MAbCjzXPbi/bOnn03ygqN58e6+IskVq/Y9f8XzTvKTsx8AgIU01zlkAABMRyEDABhsrgvDJklVHd/dn5oyDDC/s379rNER7uHYjx6bY3JMbv7ozesq31uf/dbREQAOa54Lw37b7KT762fbX19VL5k8GQDAgphnyfKXk3xPkg8nSXf/fZJHTRkKAGCRzHth2JtX7bpzgiwAAAtpnnPIbq6qb0vSswvEPiez5UuAlfr+nbtyV/r+q++SBsDhzFPIfjTJr2b5xuD7k7w+ybOmDAVsTJ8763OjIwBsSPNcGPZDSf7LGmQBAFhIhyxkVfXrSQ657tDdz5kkEQDAgjncDNneNUsBALDADlnIuvsVaxkEAGBRHW7J8le6+yeq6nU5yNJld587aTIAgAVxuCXL35s9vmgtggAALKrDLVlePXv6Dd39qyuPVdWPJ3nTlMEAABbFPFfq/6GD7PvhezkHAMDCOtw5ZBck+cEkp1XV5SsOnZDZfS0BAPjiHe4csrcl+WCSbUl+acX+TyR595ShAAAWyeHOIXt/kvcn+da1iwMAsHiOeA5ZVT2yqq6qqk9W1e1VdWdVfXwtwgEALIJ5bi7+G0nOT/KaJLuSPC3JQ6YMBcB8du/enaWlpezYsSN79uwZHQf4As1TyNLd+6pqS3ffmeRlVfW2iXMBMIelpaUcOHBgdAzgizRPIft0VR2b5Jqq2pPlE/2PnzYWAMDimKeQPTXJliQXJXluklOSfP+UoQAA7rYIS/NHLGSzb1smyWeSvGDaOADr15se9ejREe7hM1u3JFX5zP796yrfo9/sZi7cexZhaf5wF4Z9Tw5yU/G7dffDJ0kEALBgDjdD9oQ1SwEAsMCOdGHYJElVfXmSb55t/l133zp1MACARTHPhWF/IMnfJXlykh9I8o6qetLUwQAAFsU837J8XpJvvntWrKq2J/mrJJdNGQyAI3tg9+c9AhvTPIXsmFVLlB/OHDNrAEzvKXfeNToCcC+Yp5D9RVVdmeRVs+3/nOSK6SIBACyWea5D9jNV9cQk356kklzS3a+dPBkAwII4YiGrqucmeU13//Ea5AEAWDjznAt2YpIrq+otVfWs2SUwAAC4lxyxkHX3C7r7a5M8K8lXJnlTVf3V5MkAABbE0Xxb8tYkS1n+luWXTRMHAGDxzHNh2B+rqjcm+esk25I8030sAQDuPfNc9uLBSX6iu6+ZOgwAwCKa57IXF69FEACAReWK+wAAgylkAACDKWQAAIMpZAAAg83zLUsAYGb37t1ZWlrKjh07smfPntFx2CQUMgA4CktLSzlw4MDoGJP5n0950ugI9/CRWz+2/Lj0wXWV73mvvOxeey1LlgAAgylkAACDTVrIqursqrqhqvZV1SEvMFtVT6qqrqpdU+YBAFiPJitkVbUlyYuTnJPkjCQXVNUZBxl3QpLnJHnHVFkAANazKWfIzkyyr7tv7O7bk7w6yXkHGfcLSfYk+eyEWQAA1q0pv2V5UpKbV2zvT/ItKwdU1SOSnNLdf1ZVPz1hFgA2oN/4qdeNjnAPH/3Qp/71cT3lu+iXvm90BL4IU86Q1UH29b8erDomyS8n+akjvlDVhVW1t6r23nbbbfdiRACA8aYsZPuTnLJi++Qkt6zYPiHJ1yV5Y1XdlOSRSS4/2In93X1Jd+/q7l3bt2+fMDIAwNqbspBdleT0qjqtqo5Ncn6Sy+8+2N0f6+5t3b2zu3cmeXuSc7t774SZAADWnckKWXffkeSiJFcmuT7Jpd19bVW9sKrOnep9AQA2mklvndTdVyS5YtW+5x9i7GOmzAIAsF65lyUAHIXjjz3x8x7h3qCQAcBROOurnzg6ApuQe1kCAAymkAEADKaQAQAMppABAAzmpH4AYF07bssxn/e4GSlkAMC69ogvPWF0hMlt3qoJALBBKGQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAgylkAACDKWQAAIMpZAAAg01ayKrq7Kq6oar2VdXFBzn+k1V1XVW9u6r+uqoePGUeAID1aLJCVlVbkrw4yTlJzkhyQVWdsWrYu5Ls6u6HJ7ksyZ6p8gAArFdTzpCdmWRfd9/Y3bcneXWS81YO6O6/6e5PzzbfnuTkCfMAAKxLUxayk5LcvGJ7/2zfoTwjyZ9PmAcAYF3aOuFr10H29UEHVj0lya4kjz7E8QuTXJgkp5566r2VDwBgXZhyhmx/klNWbJ+c5JbVg6rqcUmel+Tc7v6Xg71Qd1/S3bu6e9f27dsnCQsAMMqUheyqJKdX1WlVdWyS85NcvnJAVT0iyf/Jchm7dcIsAADr1mSFrLvvSHJRkiuTXJ/k0u6+tqpeWFXnzob9YpJ/l+Q1VXVNVV1+iJcDANi0pjyHLN19RZIrVu17/ornj5vy/QEANgJX6gcAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhMIQMAGEwhAwAYTCEDABhs0kJWVWdX1Q1Vta+qLj7I8ftW1R/Ojr+jqnZOmQcAYD2arJBV1ZYkL05yTpIzklxQVWesGvaMJP/c3Q9J8stJ/vdUeQAA1qspZ8jOTLKvu2/s7tuTvDrJeavGnJfkFbPnlyV5bFXVhJkAANadKQvZSUluXrG9f7bvoGO6+44kH0vypRNmAgBYd6q7p3nhqicn+Z7u/m+z7acmObO7n71izLWzMftn2/84G/PhVa91YZILZ5sPS3LDJKGnty3Jh0aHWDA+87XnM197PvO15zNfexv1M39wd28/0qCtEwbYn+SUFdsnJ7nlEGP2V9XWJA9I8pHVL9TdlyS5ZKKca6aq9nb3rtE5FonPfO35zNeez3zt+czX3mb/zKdcsrwqyelVdVpVHZvk/CSXrxpzeZIfmj1/UpI39FRTdgAA69RkM2TdfUdVXZTkyiRbkvxOd19bVS9Msre7L0/y20l+r6r2ZXlm7Pyp8gAArFdTLlmmu69IcsWqfc9f8fyzSZ48ZYZ1ZsMvu25APvO15zNfez7zteczX3ub+jOf7KR+AADm49ZJAACDKWQTq6rjqurvqurvq+raqnrB6EyLoqq2VNW7qurPRmdZBFV1U1W9p6quqaq9o/Msgqp6YFVdVlX/UFXXV9W3js60mVXVw2Z/v+/++XhV/cToXJtdVT139t/P91bVq6rquNGZpmDJcmKzOw8c392frKr7JPl/SX68u98+ONqmV1U/mWRXkhO7+wmj82x2VXVTkl3dvRGvE7QhVdUrkrylu186+zb7/bv7o6NzLYLZ7QEPJPmW7n7/6DybVVWdlOX/bp7R3Z+pqkuTXNHdLx+b7N5nhmxiveyTs837zH604IlV1clJvjfJS0dngSlU1YlJHpXlb6unu29XxtbUY5P8ozK2JrYmud/seqX3zz2vabopKGRrYLZ0dk2SW5P8ZXe/Y3SmBfArSXYnuWt0kAXSSV5fVVfP7q7BtL4qyW1JXjZbmn9pVR0/OtQCOT/Jq0aH2Oy6+0CSFyX5QJIPJvlYd79+bKppKGRroLvv7O5vyPLdCs6sqq8bnWkzq6onJLm1u68enWXBnNXd35jknCTPqqpHjQ60yW1N8o1JfrO7H5HkU0kuHhtpMcyWh89N8prRWTa7qnpQkvOSnJbkK5McX1VPGZtqGgrZGpotJ7wxydmDo2x2ZyU5d3ZO06uTfGdVvXJspM2vu2+ZPd6a5LVJzhybaNPbn2T/ihn3y7Jc0JjeOUne2d3/NDrIAnhckvd1923d/bkkf5zk2wZnmoRCNrGq2l5VD5w9v1+W/3L9w9hUm1t3/2x3n9zdO7O8rPCG7t6U/0e1XlTV8VV1wt3Pk3x3kveOTbW5dfdSkpur6mGzXY9Nct3ASIvkgliuXCsfSPLIqrr/7Etyj01y/eBMk5j0Sv0kSb4iyStm38g5Jsml3e0yDGw2X57ktcv/XmZrkj/o7r8YG2khPDvJ78+W0G5M8vTBeTa9qrp/ku9K8iOjsyyC7n5HVV2W5J1J7kjyrmzSK/a77AUAwGCWLAEABlPIAAAGU8gAAAZTyAAABlPIAAAGU8gAZqpqZ1W9d/Z8V1X92uz5Y6pqU16MElgfXIcM4CC6e2+SvbPNxyT5ZJK3DQsEbGpmyIBNoaqeV1U3VNVfVdWrquqnq+qNVbVrdnzb7HZad8+EvaWq3jn7ucfs12xW7M+qameSH03y3Kq6pqq+o6reV1X3mY07sapuunsb4AthhgzY8Krqm7J8m6xHZPnftXcmOdzN5W9N8l3d/dmqOj3Lt8HZdbCB3X1TVf1Wkk9294tm7/fGJN+b5E9m7/tHs/vsAXxBzJABm8F3JHltd3+6uz+e5PIjjL9Pkv9bVe9J8pokZxzl+700/3aboqcnedlR/j7A5zFDBmwWB7sP3B35t//xPG7F/ucm+ackXz87/tmjeqPut86WPR+dZEt3u5E68EUxQwZsBm9O8p+q6n5VdUKS75vtvynJN82eP2nF+Ack+WB335XkqUm2HOH1P5HkhFX7fjfLS51mx4AvmkIGbHjd/c4kf5jkmiR/lOQts0MvSvJjVfW2JNtW/MpLkvxQVb09yUOTfOoIb/G6LBe+a6rqO2b7fj/Jg7JcygC+KNV9sFl+gI2rqn4+K07Cn+g9npTkvO5+6lTvASwO55ABHKWq+vUk5yR5/OgswOZghgwAYDDnkAEADKaQAQAMppABAAymkAEADKaQAQAMppABAAz2/wH1jJQLjvytyAAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 720x432 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Here we see that its quite a downing trend in the volatile acidity as we go higher the quality \n",
    "fig = plt.figure(figsize = (10,6))\n",
    "sns.barplot(x = 'quality', y = 'volatile acidity', data = df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. 预处理 (Preprocessing)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 目标：品质分类（好/不好）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Classified into two categories: tasty/not tasty\n",
    "def isTasty(quality):\n",
    "    if quality >= 6.5:\n",
    "        return 1\n",
    "    else:\n",
    "        return 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index([u'fixed acidity', u'volatile acidity', u'citric acid',\n",
       "       u'residual sugar', u'chlorides', u'free sulfur dioxide',\n",
       "       u'total sulfur dioxide', u'density', u'pH', u'sulphates', u'alcohol',\n",
       "       u'quality', u'tasty'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['tasty'] = df['quality'].apply(isTasty)\n",
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1382\n",
       "1     217\n",
       "Name: tasty, dtype: int64"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['tasty'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征值X和目标值y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now seperate the dataset as target variable and feature variables\n",
    "X = df.drop(['quality', 'tasty'], axis = 1)\n",
    "y = df['tasty']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>fixed acidity</th>\n",
       "      <th>volatile acidity</th>\n",
       "      <th>citric acid</th>\n",
       "      <th>residual sugar</th>\n",
       "      <th>chlorides</th>\n",
       "      <th>free sulfur dioxide</th>\n",
       "      <th>total sulfur dioxide</th>\n",
       "      <th>density</th>\n",
       "      <th>pH</th>\n",
       "      <th>sulphates</th>\n",
       "      <th>alcohol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7.4</td>\n",
       "      <td>0.70</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.9</td>\n",
       "      <td>0.076</td>\n",
       "      <td>11.0</td>\n",
       "      <td>34.0</td>\n",
       "      <td>0.9978</td>\n",
       "      <td>3.51</td>\n",
       "      <td>0.56</td>\n",
       "      <td>9.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7.8</td>\n",
       "      <td>0.88</td>\n",
       "      <td>0.00</td>\n",
       "      <td>2.6</td>\n",
       "      <td>0.098</td>\n",
       "      <td>25.0</td>\n",
       "      <td>67.0</td>\n",
       "      <td>0.9968</td>\n",
       "      <td>3.20</td>\n",
       "      <td>0.68</td>\n",
       "      <td>9.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7.8</td>\n",
       "      <td>0.76</td>\n",
       "      <td>0.04</td>\n",
       "      <td>2.3</td>\n",
       "      <td>0.092</td>\n",
       "      <td>15.0</td>\n",
       "      <td>54.0</td>\n",
       "      <td>0.9970</td>\n",
       "      <td>3.26</td>\n",
       "      <td>0.65</td>\n",
       "      <td>9.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11.2</td>\n",
       "      <td>0.28</td>\n",
       "      <td>0.56</td>\n",
       "      <td>1.9</td>\n",
       "      <td>0.075</td>\n",
       "      <td>17.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>0.9980</td>\n",
       "      <td>3.16</td>\n",
       "      <td>0.58</td>\n",
       "      <td>9.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7.4</td>\n",
       "      <td>0.70</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.9</td>\n",
       "      <td>0.076</td>\n",
       "      <td>11.0</td>\n",
       "      <td>34.0</td>\n",
       "      <td>0.9978</td>\n",
       "      <td>3.51</td>\n",
       "      <td>0.56</td>\n",
       "      <td>9.4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \\\n",
       "0            7.4              0.70         0.00             1.9      0.076   \n",
       "1            7.8              0.88         0.00             2.6      0.098   \n",
       "2            7.8              0.76         0.04             2.3      0.092   \n",
       "3           11.2              0.28         0.56             1.9      0.075   \n",
       "4            7.4              0.70         0.00             1.9      0.076   \n",
       "\n",
       "   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \\\n",
       "0                 11.0                  34.0   0.9978  3.51       0.56   \n",
       "1                 25.0                  67.0   0.9968  3.20       0.68   \n",
       "2                 15.0                  54.0   0.9970  3.26       0.65   \n",
       "3                 17.0                  60.0   0.9980  3.16       0.58   \n",
       "4                 11.0                  34.0   0.9978  3.51       0.56   \n",
       "\n",
       "   alcohol  \n",
       "0      9.4  \n",
       "1      9.8  \n",
       "2      9.8  \n",
       "3      9.8  \n",
       "4      9.4  "
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1382\n",
       "1     217\n",
       "Name: tasty, dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据集合拆分：训练train、验证validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.33, random_state=123)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train.shape: (1071, 11)\n",
      "X_val.shape: (528, 11)\n",
      "y_train.shape: (1071,)\n",
      "y_val.shape: (528,)\n"
     ]
    }
   ],
   "source": [
    "print 'X_train.shape:', X_train.shape\n",
    "print 'X_val.shape:', X_val.shape\n",
    "print 'y_train.shape:', y_train.shape\n",
    "print 'y_val.shape:', y_val.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据标准化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "# Applying Standard scaling to get optimized result\n",
    "sc = StandardScaler()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = sc.fit_transform(X_train)\n",
    "X_val = sc.fit_transform(X_val)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. 模型选择与训练 (Model Selection and Training)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "随机森林分类器：\n",
    "https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "rfc = RandomForestClassifier(n_estimators=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
       "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "            min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "            min_samples_leaf=1, min_samples_split=2,\n",
       "            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,\n",
       "            oob_score=False, random_state=None, verbose=0,\n",
       "            warm_start=False)"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rfc.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. 模型预测 (Predict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = rfc.predict(X_val)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5. 性能指标 (Performance Matrics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import confusion_matrix, accuracy_score, classification_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[451   7]\n",
      " [ 43  27]]\n"
     ]
    }
   ],
   "source": [
    "# Confusion matrix for the random forest classification\n",
    "print(confusion_matrix(y_val, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9053030303030303\n"
     ]
    }
   ],
   "source": [
    "# Accuracy score\n",
    "print(accuracy_score(y_val, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.91      0.98      0.95       458\n",
      "          1       0.79      0.39      0.52        70\n",
      "\n",
      "avg / total       0.90      0.91      0.89       528\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# classification report\n",
    "print(classification_report(y_val, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
